In this notebook, we explore the data by building summary tables and plots.
import numpy as np
import pandas as pd
import json
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import display
from wordcloud import WordCloud
from collections import Counter
from typing import Tuple, List
import folium
import folium.plugins as plugins
def set_pandas_display_options() -> None:
    """Raise pandas' display limits so wide/long frames print in full.

    Sets the column and row caps to 500, the cell text width to 100
    characters, and clears the fixed output width (``None``).
    """
    settings = {
        "max_columns": 500,
        "max_rows": 500,
        "max_colwidth": 100,
        "width": None,
    }
    for option, value in settings.items():
        pd.set_option(f"display.{option}", value)


set_pandas_display_options()
# Silence FutureWarning messages so they do not clutter the cell output.
from warnings import simplefilter

simplefilter("ignore", FutureWarning)
# Load the pickled Nevada dataset from the working directory.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('Nevada.pkl', 'rb') as nevada:
    Nevada = pickle.load(nevada)

# Show the first three rows as a quick sanity check of the load.
display(Nevada.head(3))
Let us take a look at the users by examining the activity of the most active ones.
# Per-user summary: review count, first and last review date, and mean
# star rating, ordered from the most to the least active user.
user_agg = (
    Nevada
    .groupby(['user_id', 'user_name'])
    .agg({
        'review_id': ['count'],
        'date': ['min', 'max'],
        'review_stars': ['mean'],
    })
    .sort_values([('review_id', 'count')], ascending=False)
)

print(" Top 10 Users in Yelp Dataset")
# Optional export (disabled): user_agg.head(10).to_excel(r'top_users.xlsx')
user_agg.head(10)
# Per-city summary table.
cities = Nevada['city']
by_city = Nevada.groupby('city')

# Rows per city (each row is presumably one review).
counts = cities.value_counts()
# Distinct businesses per city.
unique_businesses = by_city['business_id'].nunique()
# Mean star rating per city, to two decimals.
avg_stars = by_city['review_stars'].mean().round(2)
# Reviews per business in each city, to two decimals.
avg_reviews = counts.div(unique_businesses).round(2)

table = pd.DataFrame(
    {
        'Number of Reviews': counts,
        'Average Number of Reviews': avg_reviews,
        'Number of Businesses': unique_businesses,
        'Average stars per review': avg_stars,
    }
)
# Cities with the most businesses come first.
table = table.sort_values(by='Number of Businesses', ascending=False)
display(table)
Las Vegas, in the state of Nevada, has the most businesses, followed by the city of Henderson. It is worth mentioning the high number of reviews that characterizes the businesses of Las Vegas. What is more, every city averages over 100 reviews per business.
# Create an interactive, animated heat map with one frame per star rating.

# Order the frames by ascending rating. unique() alone returns values in
# first-seen order, which made the animation step through ratings in an
# arbitrary sequence. Missing ratings are dropped because they can never
# match a row and would only add an empty frame.
stars_list = sorted(Nevada['review_stars'].dropna().unique())

# Rearrange data to suit the format needed for folium: one list of
# [latitude, longitude] pairs per frame. Rows lacking a coordinate cannot be
# plotted, so they are dropped.
data = [
    Nevada.loc[Nevada['review_stars'] == star, ['latitude', 'longitude']]
    .dropna()
    .values
    .tolist()
    for star in stars_list
]

# Label each frame with its rating so the displayed index tells the viewer
# which rating is on screen instead of showing a bare counter.
frame_labels = [f"Stars: {star}" for star in stars_list]

# Initialize at Las Vegas (Google Coordinates)
lat = 36.127430
lon = -115.138460
zoom_start = 10
print("Las Vegas Review Map")

# basic map
m = folium.Map(location=[lat, lon], tiles="OpenStreetMap", zoom_start=zoom_start)

# Show variations across star ratings
hm = plugins.HeatMapWithTime(
    data,
    index=frame_labels,
    max_opacity=0.3,
    auto_play=True,
    display_index=True,
    radius=7,
)
hm.add_to(m)

# Last expression of the cell: renders the map in the notebook.
m